# Create data aggregated to County level ----------
library(tidyverse)
library(openintro)
library(stringr)
library(foreign)

options(scipen=0, digits=7) #to standardize number of digits when exporting data. all data here is run using these options.

controlled_subset <- FALSE

if(controlled_subset){
  load("data/sum_LESO_agency_controlled.RData")
} else {
  load("data/sum_LESO_agency.RData")
}
milit <- complete
rm(complete)

# load crime,demog and econ variables
load("data/county-crime-controls.RData")

crimecontrol <- crimecontrol %>% filter(!is.na(County))
# 3,127 unique county/state pairs
#and, have to make sure it's a complete sample to match our timeframe
crimecontrol <- crimecontrol %>% 
  group_by(County,state) %>% 
  complete(year = full_seq(2009:2014, 1))

military_county <- milit %>%
  group_by(County,State,year)%>%
  summarise_at(vars(contains("sum")), sum, na.rm=TRUE) 

mdata <- full_join(military_county,crimecontrol, by = c('year'='year',
                                              'County' = 'County',
                                              'State' = 'state'))%>%
  mutate_at(vars(sumawards_BGGears:sumallLESOquantityHARRIS), 
            ~replace(., is.na(.), 0)) %>% filter(year>2008 & year < 2016)

##merge in the milex dataset for IV's needed to conduct the analysis
milex <- read_csv("raw-Data/usmilex.csv")%>%
  mutate(lag_milex = dplyr::lag(usmilex,2))
# note this is lagged to t-2

mdata <- left_join(mdata,milex,by=c("year"="year"))

#calculate the IV used in B+G
df_county <- left_join(mdata,milex,by=c("year"="year"))%>%
  mutate(ind = ifelse(sumquantity_BGWeapons!=0 | 
                        sumquantity_BGVehicles!=0 |
                        sumquantity_BGGears!=0 |
                        sumquantity_BGOthers!=0,1,0),
         ind = if_else(is.na(ind), 0, ind))%>%
  select(County,State,year,everything()) %>% 
  arrange(State,County,year) 

df_county <- df_county %>% 
  group_by(County,State) %>% 
  summarise(Aid1 = sum(ind))%>%
  left_join(mdata,.) 

##need to create group indices for all agencies that could exist ----
df_county <- df_county %>% 
  group_by(County,State) %>% 
  mutate(g = group_indices())

# merge all together and create variables of interest
df_county <- df_county %>%
  group_by(County,State) %>%
  mutate(g = as.factor(as.character(g)),
         milex_iv = Aid1/5*log(lag_milex),
         sharemales = popmales/Population, ###finally, create the variables used in B+G : share of males/ages, logged population, medianincome
         share1519 = pop1519/Population,
         share2024 = pop2024/Population,
         share2534 = pop2534/Population,
         logpop=log(Population),
         logmedianhouseholdincome = log(medianhouseholdincome))%>% 
  arrange(State,County,year)

# Merge HARRIS instruments
harris <- read_csv("data/countyHARRISinstruments.csv")%>%
  rename(State_abbrv = State)%>%
  mutate(County = as.character(County)) 

skip <- which(harris$State_abbrv=="AK" | harris$State_abbrv=="LA")
harris$County[-skip] <- paste(harris$County[-skip], "County")
la <- which(harris$State_abbrv=="LA")
harris$County[la] <- paste(harris$County[la],'Parish')

df_county$State[is.na(df_county$State)] <- df_county$State[is.na(df_county$State)] 

df_county$County[df_county$County=='Baltimore city'] <- 'Baltimore city County'
df_county$County[df_county$County=='Carson City'] <- 'Carson City County'
df_county$County[df_county$County=='Do<f1>a Ana County'] <- 'Dona Ana County'

mdata <- df_county %>%
  mutate(state_abbrv = ifelse(is.na(state_abbrv),state2abbr(State),state_abbrv))%>%
  left_join(.,harris,
                   by=c("County" = "County","state_abbrv"="State_abbrv"))

#------------------------------------------------------------------------------------#
#CREATE INSTRUMENTS WE WILL NEED FOR THE ANALYSIS FOR SUM US ITEMS AS DV. --------
#SumUSitems = the sum of all items given to all jurisdictions within a certain year.
#zHUdI1 = the interaction of the inverse distance to the closest distribution center with sumUSitems
#zHUdI6 = the interaction of the inverse distance to the sixth closest distribution center with sumUSitems
#zHUdIland = interaction between  the  availability  of  tactical  items  and  the  jurisdiction’s  land  area  
#zHUdIhidta = interaction between sumUSitems and  an  indicator  for  having  ever  been  designated  as  a  HIDTA
#------------------------------------------------------------------------------------#
SumUSitemsmdata <- mdata %>% 
  group_by(year)  %>%
  dplyr::summarize(SumUSitems=sum(sumallLESOquantityHARRIS,na.rm=T))

county <- left_join(mdata,SumUSitemsmdata)%>%
  mutate(zHUdI1 = (1/(closestFACs/1609.34))*SumUSitems,
         zHUdI6 = (1/(sixthclosestFACs/1609.34))*SumUSitems,
         zHUdIland = log(landareamiles) * SumUSitems,
         zHUdIhidta = IS_HIDTA * SumUSitems) 

#------------------------------------------------------------------------------------#
#CREATE INSTRUMENTS WE WILL NEED FOR THE ANALYSIS FOR SUM US VALUE AS DV. -------
#SumUSvalue = the sum of all items given to all jurisdictions within a certain year.
#zHUd1 = the interaction of the inverse distance to the closest distribution center with sumUSitems
#zHUd6 = the interaction of the inverse distance to the sixth closest distribution center with sumUSitems
#zHUdland = interaction between  the  availability  of  tactical  items  and  the  jurisdiction’s  land  area  
#zHUdhidta = interaction between sumUSitems and  an  indicator  for  having  ever  been  designated  as  a  HIDTA
#------------------------------------------------------------------------------------#
SumUSvaluemdata <- mdata %>% 
  group_by(year)  %>%
  dplyr::summarize(SumUSvalue=sum(sumallLESOaidHARRIS,na.rm=T)) %>%
  as.data.frame()

county <- left_join(county,SumUSvaluemdata)%>%
  mutate(zHUd1 = (1/(closestFACs/1609.34))*SumUSvalue,
         zHUd6 = (1/(sixthclosestFACs/1609.34))*SumUSvalue,
         zHUdland = log(landareamiles) * SumUSvalue,
         zHUdhitda = IS_HIDTA * SumUSvalue) %>%
  group_by(County,State)%>%
  mutate_at(vars(zHUd1,zHUd6,zHUdland,zHUdhitda,SumUSvalue,
                 zHUdI1,zHUdI6,zHUdIland,zHUdIhidta,sumallLESOaidBG,
                 sumallLESOaidHARRIS,sumallLESOquantityHARRIS,
                 MURDER_ArrestRate, RAPE_ArrestRate, ROBBERY_ArrestRate,
                 AGASSLT_ArrestRate,BURGLRY_ArrestRate,OTHASLT_ArrestRate,WEAPONS_ArrestRate,
                 DRGSALE_ArrestRate,DRGPOSS_ArrestRate,SumUSitems,SumUSvalue),
            funs(lag=dplyr::lag(.))) %>%
  filter(year>2009 & year<2015) 

fix_NA <- grep("sum",names(county),ignore.case = T)
county[ ,fix_NA][is.na(county[ ,fix_NA])] <- 0  
county <- county %>% filter(!is.na(year))

# generate state*year fixed effect
# and numeric state variable for Stata
county <- county %>%
  group_by(State) %>%
  mutate(state_numeric = group_indices()) %>%
  group_by(State,year) %>%
  mutate(state_year = group_indices())

# Save data ----------
if(controlled_subset){
  save(county,file="data/county-aggregate-controlled.RData")
} else {
  save(county,file="data/county-aggregate.RData")
  write.csv(county,file="data/county-aggregate.csv")
}

#and, save for stata to be able to calculate the f-stats
county$g <- as.numeric(county$g)
countystata <- county %>%
  group_by(State,year) %>%
  mutate(State_numeric = group_indices())

if(controlled_subset==FALSE){
  write.dta(countystata, file='data/county.dta') 
}


